$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json

# Azure ML command component that augments each record of a JSONL dataset
# with randomly chosen records from a second JSONL dataset.
name: jsonl_random_examples
version: 0.0.1pre1
display_name: JSONL Random Examples
type: command
description: |
  Takes two JSONL files, 'input' and 'examples'.
  For each line in the input, randomly selects 'num_examples' examples
  and stores them as a list under the output key.
# Random selection is driven by the 'random_seed' input, so runs with the
# same inputs are expected to be repeatable (confirm against the script).
is_deterministic: true

inputs:
  # Data to be augmented, and how to decode it.
  input_dataset:
    description: Dataset containing JSONL input
    type: uri_file
    optional: false
  input_encoding:
    description: Encoding format of the input dataset
    type: string
    default: utf-8-sig
    optional: false

  # Pool the examples are drawn from, and how to decode it.
  example_dataset:
    description: Dataset containing JSONL example data
    type: uri_file
    optional: false
  example_encoding:
    description: Encoding format of the example dataset
    type: string
    default: utf-8-sig
    optional: false

  # How the result is written.
  output_encoding:
    description: Encoding format of the output dataset
    type: string
    default: utf-8-sig
    optional: false
  output_key:
    description: Key in which to store the list of examples
    type: string
    optional: false

  # Sampling controls; both are required and have no default.
  num_examples:
    description: How many examples to select
    type: integer
    optional: false
  random_seed:
    description: Seed for selecting random numbers
    type: integer
    optional: false
  
  
outputs:
  # Location handed to the script through the --output_dataset flag.
  output_dataset:
    description: JSONL file containing inputs with examples appended
    type: uri_file


# Source directory shipped with the component; presumably contains
# jsonl_random_examples.py, which the command below invokes by relative path.
code: ./src/

# Folded scalar ('>-'): the lines below are joined with single spaces into
# one command line with no trailing newline. Each declared input, plus the
# output location, is forwarded to the script as a flag of the same name.
# Do not place '#' comments inside the scalar body; they would become part
# of the command text.
command: >-
  python ./jsonl_random_examples.py
  --input_dataset ${{ inputs.input_dataset }}
  --input_encoding ${{ inputs.input_encoding }}
  --example_dataset ${{ inputs.example_dataset }}
  --example_encoding ${{ inputs.example_encoding }}
  --output_dataset ${{ outputs.output_dataset }}
  --output_encoding ${{ inputs.output_encoding }}
  --output_key ${{ inputs.output_key }}
  --num_examples ${{ inputs.num_examples }}
  --random_seed ${{ inputs.random_seed }}


environment:
  # Placeholder environment reference: this value will be updated when the
  # component is uploaded, so do not rely on it as written here.
  image: azureml:promptbase_aml@latest
